From 9b77bb790dd6d833a9b31814ba00f48b5fa47afb Mon Sep 17 00:00:00 2001
From: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
Date: Sat, 24 Jan 2026 01:35:13 +0900
Subject: [PATCH] [Frontend] add logprob, compression_ratio to 'verbose_json'
 features (#31059)

Signed-off-by: sangbumlikeagod <oironese@naver.com>
Signed-off-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
---
 docs/serving/openai_compatible_server.md      |  2 +-
 .../test_transcription_validation_whisper.py  |  2 ++
 .../openai/translations/protocol.py           |  8 ++---
 .../openai/translations/speech_to_text.py     | 34 ++++++++++++++-----
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 438624d91..85d591e3d 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -521,7 +521,7 @@ For `verbose_json` response format:
       ]
     }
     ```
-Currently “verbose_json” response format doesn’t support avg_logprob, compression_ratio, no_speech_prob.
+Currently, the `verbose_json` response format does not support the `no_speech_prob` field.
 
 #### Extra Parameters
 
diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py
index 8bf729c51..67e358ffa 100644
--- a/tests/entrypoints/openai/test_transcription_validation_whisper.py
+++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py
@@ -244,6 +244,8 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
     )
     assert transcription.segments is not None
     assert len(transcription.segments) > 0
+    assert transcription.segments[0].avg_logprob is not None
+    assert transcription.segments[0].compression_ratio is not None
 
 
 @pytest.mark.asyncio
diff --git a/vllm/entrypoints/openai/translations/protocol.py b/vllm/entrypoints/openai/translations/protocol.py
index a3c36c107..978113e6a 100644
--- a/vllm/entrypoints/openai/translations/protocol.py
+++ b/vllm/entrypoints/openai/translations/protocol.py
@@ -271,13 +271,13 @@ class TranscriptionSegment(OpenAIBaseModel):
     id: int
     """Unique identifier of the segment."""
 
-    avg_logprob: float | None = None
+    avg_logprob: float
     """Average logprob of the segment.
 
     If the value is lower than -1, consider the logprobs failed.
     """
 
-    compression_ratio: float | None = None
+    compression_ratio: float
     """Compression ratio of the segment.
 
     If the value is greater than 2.4, consider the compression failed.
@@ -487,13 +487,13 @@ class TranslationSegment(OpenAIBaseModel):
     id: int
     """Unique identifier of the segment."""
 
-    avg_logprob: float | None = None
+    avg_logprob: float
     """Average logprob of the segment.
 
     If the value is lower than -1, consider the logprobs failed.
     """
 
-    compression_ratio: float | None = None
+    compression_ratio: float
     """Compression ratio of the segment.
 
     If the value is greater than 2.4, consider the compression failed.
diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py
index 48086c030..1e934aab8 100644
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ b/vllm/entrypoints/openai/translations/speech_to_text.py
@@ -4,6 +4,7 @@ import asyncio
 import io
 import math
 import time
+import zlib
 from collections.abc import AsyncGenerator, Callable
 from functools import cached_property
 from typing import Literal, TypeAlias, TypeVar, cast
@@ -38,6 +39,7 @@ from vllm.entrypoints.openai.translations.protocol import (
 from vllm.exceptions import VLLMValidationError
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
+from vllm.logprobs import FlatLogprobs, Logprob
 from vllm.model_executor.models import SupportsTranscription, supports_transcription
 from vllm.outputs import RequestOutput
 from vllm.tokenizers import get_tokenizer
@@ -317,6 +319,7 @@ class OpenAISpeechToText(OpenAIServing):
     def _get_verbose_segments(
         self,
         tokens: tuple,
+        log_probs: FlatLogprobs | list[dict[int, Logprob]],
         request: SpeechToTextRequest,
         segment_class: type[SpeechToTextSegment],
         start_time: float = 0,
@@ -329,8 +332,7 @@ class OpenAISpeechToText(OpenAIServing):
         If the tokens do not include timestamp information,
         the segments may not be generated correctly.
 
-        Note: Fields like avg_logprob, compression_ratio,
-        and no_speech_prob are not supported
+        Note: The no_speech_prob field is not supported
         in this implementation and will be None. See docs for details.
         """
         BASE_OFFSET = 0.02
@@ -344,17 +346,17 @@ class OpenAISpeechToText(OpenAIServing):
 
         if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
             tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
-        for idx, token in enumerate(tokens_with_start):
+        avg_logprob = 0.0
+        for idx in range(1, len(tokens_with_start)):
             # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
             # If the ordering is violated, this slicing may produce incorrect results.
-            if (
-                token >= init_token
-                and idx != 0
-                and tokens_with_start[idx - 1] >= init_token
-            ):
+            token = tokens_with_start[idx]
+            if token >= init_token and tokens_with_start[idx - 1] >= init_token:
                 sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
                 start_timestamp = sliced_timestamp_tokens[0] - init_token
                 end_timestamp = sliced_timestamp_tokens[-1] - init_token
+                text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
+                text_bytes = text.encode("utf-8")
 
                 casting_segment = cast(
                     SpeechToTextSegment,
@@ -364,12 +366,22 @@ class OpenAISpeechToText(OpenAIServing):
                         start=start_time + BASE_OFFSET * start_timestamp,
                         end=start_time + BASE_OFFSET * end_timestamp,
                         temperature=request.temperature,
-                        text=self.tokenizer.decode(sliced_timestamp_tokens[1:-1]),
+                        text=text,
+                        # The compression ratio measures
+                        # how compressible the generated text is.
+                        # A higher ratio indicates more repetitive content,
+                        # which is a strong sign of hallucination in outputs.
+                        compression_ratio=len(text_bytes)
+                        / len(zlib.compress(text_bytes)),
                         tokens=sliced_timestamp_tokens[1:-1],
+                        avg_logprob=avg_logprob / (idx - last_timestamp_start),
                     ),
                 )
                 segments.append(casting_segment)
                 last_timestamp_start = idx
+                avg_logprob = 0
+            else:
+                avg_logprob += log_probs[idx - 1][token].logprob
         return segments
 
     async def _create_speech_to_text(
@@ -443,6 +455,8 @@ class OpenAISpeechToText(OpenAIServing):
             sampling_params = request.to_sampling_params(
                 default_max_tokens, self.default_sampling_params
             )
+            if request.response_format == "verbose_json":
+                sampling_params.logprobs = 1
 
             self._log_inputs(
                 request_id,
@@ -490,12 +504,14 @@ class OpenAISpeechToText(OpenAIServing):
                 )
                 async for op in result_generator:
                     if request.response_format == "verbose_json":
+                        assert op.outputs[0].logprobs
                         segments: list[SpeechToTextSegment] = (
                             self._get_verbose_segments(
                                 tokens=tuple(op.outputs[0].token_ids),
                                 segment_class=segment_class,
                                 request=request,
                                 start_time=start_time,
+                                log_probs=op.outputs[0].logprobs,
                             )
                         )