[Frontend] add logprob, compression_rate to 'verbose_json' features (#31059)

Signed-off-by: sangbumlikeagod <oironese@naver.com>
Signed-off-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
This commit is contained in:
sangbumlikeagod
2026-01-24 01:35:13 +09:00
committed by GitHub
parent 305e53ade8
commit 9b77bb790d
4 changed files with 32 additions and 14 deletions

View File

@@ -521,7 +521,7 @@ For `verbose_json` response format:
] ]
} }
``` ```
Currently “verbose_json” response format doesn't support avg_logprob, compression_ratio, no_speech_prob. Currently “verbose_json” response format doesn't support no_speech_prob.
#### Extra Parameters #### Extra Parameters

View File

@@ -244,6 +244,8 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
) )
assert transcription.segments is not None assert transcription.segments is not None
assert len(transcription.segments) > 0 assert len(transcription.segments) > 0
assert transcription.segments[0].avg_logprob is not None
assert transcription.segments[0].compression_ratio is not None
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@@ -271,13 +271,13 @@ class TranscriptionSegment(OpenAIBaseModel):
id: int id: int
"""Unique identifier of the segment.""" """Unique identifier of the segment."""
avg_logprob: float | None = None avg_logprob: float
"""Average logprob of the segment. """Average logprob of the segment.
If the value is lower than -1, consider the logprobs failed. If the value is lower than -1, consider the logprobs failed.
""" """
compression_ratio: float | None = None compression_ratio: float
"""Compression ratio of the segment. """Compression ratio of the segment.
If the value is greater than 2.4, consider the compression failed. If the value is greater than 2.4, consider the compression failed.
@@ -487,13 +487,13 @@ class TranslationSegment(OpenAIBaseModel):
id: int id: int
"""Unique identifier of the segment.""" """Unique identifier of the segment."""
avg_logprob: float | None = None avg_logprob: float
"""Average logprob of the segment. """Average logprob of the segment.
If the value is lower than -1, consider the logprobs failed. If the value is lower than -1, consider the logprobs failed.
""" """
compression_ratio: float | None = None compression_ratio: float
"""Compression ratio of the segment. """Compression ratio of the segment.
If the value is greater than 2.4, consider the compression failed. If the value is greater than 2.4, consider the compression failed.

View File

@@ -4,6 +4,7 @@ import asyncio
import io import io
import math import math
import time import time
import zlib
from collections.abc import AsyncGenerator, Callable from collections.abc import AsyncGenerator, Callable
from functools import cached_property from functools import cached_property
from typing import Literal, TypeAlias, TypeVar, cast from typing import Literal, TypeAlias, TypeVar, cast
@@ -38,6 +39,7 @@ from vllm.entrypoints.openai.translations.protocol import (
from vllm.exceptions import VLLMValidationError from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import PromptType from vllm.inputs.data import PromptType
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.logprobs import FlatLogprobs, Logprob
from vllm.model_executor.models import SupportsTranscription, supports_transcription from vllm.model_executor.models import SupportsTranscription, supports_transcription
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
@@ -317,6 +319,7 @@ class OpenAISpeechToText(OpenAIServing):
def _get_verbose_segments( def _get_verbose_segments(
self, self,
tokens: tuple, tokens: tuple,
log_probs: FlatLogprobs | list[dict[int, Logprob]],
request: SpeechToTextRequest, request: SpeechToTextRequest,
segment_class: type[SpeechToTextSegment], segment_class: type[SpeechToTextSegment],
start_time: float = 0, start_time: float = 0,
@@ -329,8 +332,7 @@ class OpenAISpeechToText(OpenAIServing):
If the tokens do not include timestamp information, If the tokens do not include timestamp information,
the segments may not be generated correctly. the segments may not be generated correctly.
Note: Fields like avg_logprob, compression_ratio, Note: No_speech_prob field is not supported
and no_speech_prob are not supported
in this implementation and will be None. See docs for details. in this implementation and will be None. See docs for details.
""" """
BASE_OFFSET = 0.02 BASE_OFFSET = 0.02
@@ -344,17 +346,17 @@ class OpenAISpeechToText(OpenAIServing):
if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token: if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
tokens_with_start = tokens_with_start + (tokens_with_start[-1],) tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
for idx, token in enumerate(tokens_with_start): avg_logprob = 0.0
for idx in range(1, len(tokens_with_start)):
# Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted. # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
# If the ordering is violated, this slicing may produce incorrect results. # If the ordering is violated, this slicing may produce incorrect results.
if ( token = tokens_with_start[idx]
token >= init_token if token >= init_token and tokens_with_start[idx - 1] >= init_token:
and idx != 0
and tokens_with_start[idx - 1] >= init_token
):
sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx] sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
start_timestamp = sliced_timestamp_tokens[0] - init_token start_timestamp = sliced_timestamp_tokens[0] - init_token
end_timestamp = sliced_timestamp_tokens[-1] - init_token end_timestamp = sliced_timestamp_tokens[-1] - init_token
text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
text_bytes = text.encode("utf-8")
casting_segment = cast( casting_segment = cast(
SpeechToTextSegment, SpeechToTextSegment,
@@ -364,12 +366,22 @@ class OpenAISpeechToText(OpenAIServing):
start=start_time + BASE_OFFSET * start_timestamp, start=start_time + BASE_OFFSET * start_timestamp,
end=start_time + BASE_OFFSET * end_timestamp, end=start_time + BASE_OFFSET * end_timestamp,
temperature=request.temperature, temperature=request.temperature,
text=self.tokenizer.decode(sliced_timestamp_tokens[1:-1]), text=text,
# The compression ratio measures
# how compressible the generated text is.
# A higher ratio indicates more repetitive content,
# which is a strong sign of hallucination in outputs.
compression_ratio=len(text_bytes)
/ len(zlib.compress(text_bytes)),
tokens=sliced_timestamp_tokens[1:-1], tokens=sliced_timestamp_tokens[1:-1],
avg_logprob=avg_logprob / (idx - last_timestamp_start),
), ),
) )
segments.append(casting_segment) segments.append(casting_segment)
last_timestamp_start = idx last_timestamp_start = idx
avg_logprob = 0
else:
avg_logprob += log_probs[idx - 1][token].logprob
return segments return segments
async def _create_speech_to_text( async def _create_speech_to_text(
@@ -443,6 +455,8 @@ class OpenAISpeechToText(OpenAIServing):
sampling_params = request.to_sampling_params( sampling_params = request.to_sampling_params(
default_max_tokens, self.default_sampling_params default_max_tokens, self.default_sampling_params
) )
if request.response_format == "verbose_json":
sampling_params.logprobs = 1
self._log_inputs( self._log_inputs(
request_id, request_id,
@@ -490,12 +504,14 @@ class OpenAISpeechToText(OpenAIServing):
) )
async for op in result_generator: async for op in result_generator:
if request.response_format == "verbose_json": if request.response_format == "verbose_json":
assert op.outputs[0].logprobs
segments: list[SpeechToTextSegment] = ( segments: list[SpeechToTextSegment] = (
self._get_verbose_segments( self._get_verbose_segments(
tokens=tuple(op.outputs[0].token_ids), tokens=tuple(op.outputs[0].token_ids),
segment_class=segment_class, segment_class=segment_class,
request=request, request=request,
start_time=start_time, start_time=start_time,
log_probs=op.outputs[0].logprobs,
) )
) )