From 9b77bb790dd6d833a9b31814ba00f48b5fa47afb Mon Sep 17 00:00:00 2001 From: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com> Date: Sat, 24 Jan 2026 01:35:13 +0900 Subject: [PATCH] [Frontend] add logprob, compression_rate to 'verbose_json' features (#31059) Signed-off-by: sangbumlikeagod Signed-off-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com> --- docs/serving/openai_compatible_server.md | 2 +- .../test_transcription_validation_whisper.py | 2 ++ .../openai/translations/protocol.py | 8 ++--- .../openai/translations/speech_to_text.py | 34 ++++++++++++++----- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 438624d91..85d591e3d 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -521,7 +521,7 @@ For `verbose_json` response format: ] } ``` -Currently “verbose_json” response format doesn’t support avg_logprob, compression_ratio, no_speech_prob. +Currently “verbose_json” response format doesn’t support no_speech_prob. #### Extra Parameters diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py index 8bf729c51..67e358ffa 100644 --- a/tests/entrypoints/openai/test_transcription_validation_whisper.py +++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py @@ -244,6 +244,8 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client): ) assert transcription.segments is not None assert len(transcription.segments) > 0 + assert transcription.segments[0].avg_logprob is not None + assert transcription.segments[0].compression_ratio is not None @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/translations/protocol.py b/vllm/entrypoints/openai/translations/protocol.py index a3c36c107..978113e6a 100644 --- a/vllm/entrypoints/openai/translations/protocol.py +++ b/vllm/entrypoints/openai/translations/protocol.py @@ -271,13 +271,13 @@ class TranscriptionSegment(OpenAIBaseModel): id: int """Unique identifier of the segment.""" - avg_logprob: float | None = None + avg_logprob: float """Average logprob of the segment. If the value is lower than -1, consider the logprobs failed. """ - compression_ratio: float | None = None + compression_ratio: float """Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed. @@ -487,13 +487,13 @@ class TranslationSegment(OpenAIBaseModel): id: int """Unique identifier of the segment.""" - avg_logprob: float | None = None + avg_logprob: float """Average logprob of the segment. If the value is lower than -1, consider the logprobs failed. """ - compression_ratio: float | None = None + compression_ratio: float """Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed. diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py index 48086c030..1e934aab8 100644 --- a/vllm/entrypoints/openai/translations/speech_to_text.py +++ b/vllm/entrypoints/openai/translations/speech_to_text.py @@ -4,6 +4,7 @@ import asyncio import io import math import time +import zlib from collections.abc import AsyncGenerator, Callable from functools import cached_property from typing import Literal, TypeAlias, TypeVar, cast @@ -38,6 +39,7 @@ from vllm.entrypoints.openai.translations.protocol import ( from vllm.exceptions import VLLMValidationError from vllm.inputs.data import PromptType from vllm.logger import init_logger +from vllm.logprobs import FlatLogprobs, Logprob from vllm.model_executor.models import SupportsTranscription, supports_transcription from vllm.outputs import RequestOutput from vllm.tokenizers import get_tokenizer @@ -317,6 +319,7 @@ class OpenAISpeechToText(OpenAIServing): def _get_verbose_segments( self, tokens: tuple, + log_probs: FlatLogprobs | list[dict[int, Logprob]], request: SpeechToTextRequest, segment_class: type[SpeechToTextSegment], start_time: float = 0, @@ -329,8 +332,7 @@ class OpenAISpeechToText(OpenAIServing): If the tokens do not include timestamp information, the segments may not be generated correctly. - Note: Fields like avg_logprob, compression_ratio, - and no_speech_prob are not supported + Note: No_speech_prob field is not supported in this implementation and will be None. See docs for details. """ BASE_OFFSET = 0.02 @@ -344,17 +346,17 @@ class OpenAISpeechToText(OpenAIServing): if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token: tokens_with_start = tokens_with_start + (tokens_with_start[-1],) - for idx, token in enumerate(tokens_with_start): + avg_logprob = 0.0 + for idx in range(1, len(tokens_with_start)): # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted. # If the ordering is violated, this slicing may produce incorrect results. - if ( - token >= init_token - and idx != 0 - and tokens_with_start[idx - 1] >= init_token - ): + token = tokens_with_start[idx] + if token >= init_token and tokens_with_start[idx - 1] >= init_token: sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx] start_timestamp = sliced_timestamp_tokens[0] - init_token end_timestamp = sliced_timestamp_tokens[-1] - init_token + text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1]) + text_bytes = text.encode("utf-8") casting_segment = cast( SpeechToTextSegment, @@ -364,12 +366,22 @@ class OpenAISpeechToText(OpenAIServing): start=start_time + BASE_OFFSET * start_timestamp, end=start_time + BASE_OFFSET * end_timestamp, temperature=request.temperature, - text=self.tokenizer.decode(sliced_timestamp_tokens[1:-1]), + text=text, + # The compression ratio measures + # how compressible the generated text is. + # A higher ratio indicates more repetitive content, + # which is a strong sign of hallucination in outputs. + compression_ratio=len(text_bytes) + / len(zlib.compress(text_bytes)), tokens=sliced_timestamp_tokens[1:-1], + avg_logprob=avg_logprob / (idx - last_timestamp_start), ), ) segments.append(casting_segment) last_timestamp_start = idx + avg_logprob = 0 + else: + avg_logprob += log_probs[idx - 1][token].logprob return segments async def _create_speech_to_text( @@ -443,6 +455,8 @@ class OpenAISpeechToText(OpenAIServing): sampling_params = request.to_sampling_params( default_max_tokens, self.default_sampling_params ) + if request.response_format == "verbose_json": + sampling_params.logprobs = 1 self._log_inputs( request_id, @@ -490,12 +504,14 @@ class OpenAISpeechToText(OpenAIServing): ) async for op in result_generator: if request.response_format == "verbose_json": + assert op.outputs[0].logprobs segments: list[SpeechToTextSegment] = ( self._get_verbose_segments( tokens=tuple(op.outputs[0].token_ids), segment_class=segment_class, request=request, start_time=start_time, + log_probs=op.outputs[0].logprobs, ) )