[Frontend] add logprob, compression_ratio to 'verbose_json' features (#31059)
Signed-off-by: sangbumlikeagod <oironese@naver.com> Signed-off-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
This commit is contained in:
@@ -271,13 +271,13 @@ class TranscriptionSegment(OpenAIBaseModel):
|
||||
id: int
|
||||
"""Unique identifier of the segment."""
|
||||
|
||||
avg_logprob: float | None = None
|
||||
avg_logprob: float
|
||||
"""Average logprob of the segment.
|
||||
|
||||
If the value is lower than -1, consider the logprobs failed.
|
||||
"""
|
||||
|
||||
compression_ratio: float | None = None
|
||||
compression_ratio: float
|
||||
"""Compression ratio of the segment.
|
||||
|
||||
If the value is greater than 2.4, consider the compression failed.
|
||||
@@ -487,13 +487,13 @@ class TranslationSegment(OpenAIBaseModel):
|
||||
id: int
|
||||
"""Unique identifier of the segment."""
|
||||
|
||||
avg_logprob: float | None = None
|
||||
avg_logprob: float
|
||||
"""Average logprob of the segment.
|
||||
|
||||
If the value is lower than -1, consider the logprobs failed.
|
||||
"""
|
||||
|
||||
compression_ratio: float | None = None
|
||||
compression_ratio: float
|
||||
"""Compression ratio of the segment.
|
||||
|
||||
If the value is greater than 2.4, consider the compression failed.
|
||||
|
||||
@@ -4,6 +4,7 @@ import asyncio
|
||||
import io
|
||||
import math
|
||||
import time
|
||||
import zlib
|
||||
from collections.abc import AsyncGenerator, Callable
|
||||
from functools import cached_property
|
||||
from typing import Literal, TypeAlias, TypeVar, cast
|
||||
@@ -38,6 +39,7 @@ from vllm.entrypoints.openai.translations.protocol import (
|
||||
from vllm.exceptions import VLLMValidationError
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import FlatLogprobs, Logprob
|
||||
from vllm.model_executor.models import SupportsTranscription, supports_transcription
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
@@ -317,6 +319,7 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
def _get_verbose_segments(
|
||||
self,
|
||||
tokens: tuple,
|
||||
log_probs: FlatLogprobs | list[dict[int, Logprob]],
|
||||
request: SpeechToTextRequest,
|
||||
segment_class: type[SpeechToTextSegment],
|
||||
start_time: float = 0,
|
||||
@@ -329,8 +332,7 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
If the tokens do not include timestamp information,
|
||||
the segments may not be generated correctly.
|
||||
|
||||
Note: Fields like avg_logprob, compression_ratio,
|
||||
and no_speech_prob are not supported
|
||||
Note: No_speech_prob field is not supported
|
||||
in this implementation and will be None. See docs for details.
|
||||
"""
|
||||
BASE_OFFSET = 0.02
|
||||
@@ -344,17 +346,17 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
|
||||
if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
|
||||
tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
|
||||
for idx, token in enumerate(tokens_with_start):
|
||||
avg_logprob = 0.0
|
||||
for idx in range(1, len(tokens_with_start)):
|
||||
# Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
|
||||
# If the ordering is violated, this slicing may produce incorrect results.
|
||||
if (
|
||||
token >= init_token
|
||||
and idx != 0
|
||||
and tokens_with_start[idx - 1] >= init_token
|
||||
):
|
||||
token = tokens_with_start[idx]
|
||||
if token >= init_token and tokens_with_start[idx - 1] >= init_token:
|
||||
sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
|
||||
start_timestamp = sliced_timestamp_tokens[0] - init_token
|
||||
end_timestamp = sliced_timestamp_tokens[-1] - init_token
|
||||
text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
|
||||
text_bytes = text.encode("utf-8")
|
||||
|
||||
casting_segment = cast(
|
||||
SpeechToTextSegment,
|
||||
@@ -364,12 +366,22 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
start=start_time + BASE_OFFSET * start_timestamp,
|
||||
end=start_time + BASE_OFFSET * end_timestamp,
|
||||
temperature=request.temperature,
|
||||
text=self.tokenizer.decode(sliced_timestamp_tokens[1:-1]),
|
||||
text=text,
|
||||
# The compression ratio measures
|
||||
# how compressible the generated text is.
|
||||
# A higher ratio indicates more repetitive content,
|
||||
# which is a strong sign of hallucination in outputs.
|
||||
compression_ratio=len(text_bytes)
|
||||
/ len(zlib.compress(text_bytes)),
|
||||
tokens=sliced_timestamp_tokens[1:-1],
|
||||
avg_logprob=avg_logprob / (idx - last_timestamp_start),
|
||||
),
|
||||
)
|
||||
segments.append(casting_segment)
|
||||
last_timestamp_start = idx
|
||||
avg_logprob = 0
|
||||
else:
|
||||
avg_logprob += log_probs[idx - 1][token].logprob
|
||||
return segments
|
||||
|
||||
async def _create_speech_to_text(
|
||||
@@ -443,6 +455,8 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
sampling_params = request.to_sampling_params(
|
||||
default_max_tokens, self.default_sampling_params
|
||||
)
|
||||
if request.response_format == "verbose_json":
|
||||
sampling_params.logprobs = 1
|
||||
|
||||
self._log_inputs(
|
||||
request_id,
|
||||
@@ -490,12 +504,14 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
)
|
||||
async for op in result_generator:
|
||||
if request.response_format == "verbose_json":
|
||||
assert op.outputs[0].logprobs
|
||||
segments: list[SpeechToTextSegment] = (
|
||||
self._get_verbose_segments(
|
||||
tokens=tuple(op.outputs[0].token_ids),
|
||||
segment_class=segment_class,
|
||||
request=request,
|
||||
start_time=start_time,
|
||||
log_probs=op.outputs[0].logprobs,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user