[Frontend] add logprob, compression_rate to 'verbose_json' features (#31059)

Signed-off-by: sangbumlikeagod <oironese@naver.com>
Signed-off-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
This commit is contained in:
sangbumlikeagod
2026-01-24 01:35:13 +09:00
committed by GitHub
parent 305e53ade8
commit 9b77bb790d
4 changed files with 32 additions and 14 deletions

View File

@@ -521,7 +521,7 @@ For `verbose_json` response format:
] ]
} }
``` ```
Currently “verbose_json” response format doesn't support avg_logprob, compression_ratio, no_speech_prob. Currently “verbose_json” response format doesn't support no_speech_prob.
#### Extra Parameters #### Extra Parameters

View File

@@ -244,6 +244,8 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
) )
assert transcription.segments is not None assert transcription.segments is not None
assert len(transcription.segments) > 0 assert len(transcription.segments) > 0
assert transcription.segments[0].avg_logprob is not None
assert transcription.segments[0].compression_ratio is not None
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@@ -271,13 +271,13 @@ class TranscriptionSegment(OpenAIBaseModel):
id: int id: int
"""Unique identifier of the segment.""" """Unique identifier of the segment."""
avg_logprob: float | None = None avg_logprob: float
"""Average logprob of the segment. """Average logprob of the segment.
If the value is lower than -1, consider the logprobs failed. If the value is lower than -1, consider the logprobs failed.
""" """
compression_ratio: float | None = None compression_ratio: float
"""Compression ratio of the segment. """Compression ratio of the segment.
If the value is greater than 2.4, consider the compression failed. If the value is greater than 2.4, consider the compression failed.
@@ -487,13 +487,13 @@ class TranslationSegment(OpenAIBaseModel):
id: int id: int
"""Unique identifier of the segment.""" """Unique identifier of the segment."""
avg_logprob: float | None = None avg_logprob: float
"""Average logprob of the segment. """Average logprob of the segment.
If the value is lower than -1, consider the logprobs failed. If the value is lower than -1, consider the logprobs failed.
""" """
compression_ratio: float | None = None compression_ratio: float
"""Compression ratio of the segment. """Compression ratio of the segment.
If the value is greater than 2.4, consider the compression failed. If the value is greater than 2.4, consider the compression failed.

View File

@@ -4,6 +4,7 @@ import asyncio
import io import io
import math import math
import time import time
import zlib
from collections.abc import AsyncGenerator, Callable from collections.abc import AsyncGenerator, Callable
from functools import cached_property from functools import cached_property
from typing import Literal, TypeAlias, TypeVar, cast from typing import Literal, TypeAlias, TypeVar, cast
@@ -38,6 +39,7 @@ from vllm.entrypoints.openai.translations.protocol import (
from vllm.exceptions import VLLMValidationError from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import PromptType from vllm.inputs.data import PromptType
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.logprobs import FlatLogprobs, Logprob
from vllm.model_executor.models import SupportsTranscription, supports_transcription from vllm.model_executor.models import SupportsTranscription, supports_transcription
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
@@ -317,6 +319,7 @@ class OpenAISpeechToText(OpenAIServing):
def _get_verbose_segments( def _get_verbose_segments(
self, self,
tokens: tuple, tokens: tuple,
log_probs: FlatLogprobs | list[dict[int, Logprob]],
request: SpeechToTextRequest, request: SpeechToTextRequest,
segment_class: type[SpeechToTextSegment], segment_class: type[SpeechToTextSegment],
start_time: float = 0, start_time: float = 0,
@@ -329,8 +332,7 @@ class OpenAISpeechToText(OpenAIServing):
If the tokens do not include timestamp information, If the tokens do not include timestamp information,
the segments may not be generated correctly. the segments may not be generated correctly.
Note: Fields like avg_logprob, compression_ratio, Note: No_speech_prob field is not supported
and no_speech_prob are not supported
in this implementation and will be None. See docs for details. in this implementation and will be None. See docs for details.
""" """
BASE_OFFSET = 0.02 BASE_OFFSET = 0.02
@@ -344,17 +346,17 @@ class OpenAISpeechToText(OpenAIServing):
if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token: if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
tokens_with_start = tokens_with_start + (tokens_with_start[-1],) tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
for idx, token in enumerate(tokens_with_start): avg_logprob = 0.0
for idx in range(1, len(tokens_with_start)):
# Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted. # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
# If the ordering is violated, this slicing may produce incorrect results. # If the ordering is violated, this slicing may produce incorrect results.
if ( token = tokens_with_start[idx]
token >= init_token if token >= init_token and tokens_with_start[idx - 1] >= init_token:
and idx != 0
and tokens_with_start[idx - 1] >= init_token
):
sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx] sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
start_timestamp = sliced_timestamp_tokens[0] - init_token start_timestamp = sliced_timestamp_tokens[0] - init_token
end_timestamp = sliced_timestamp_tokens[-1] - init_token end_timestamp = sliced_timestamp_tokens[-1] - init_token
text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
text_bytes = text.encode("utf-8")
casting_segment = cast( casting_segment = cast(
SpeechToTextSegment, SpeechToTextSegment,
@@ -364,12 +366,22 @@ class OpenAISpeechToText(OpenAIServing):
start=start_time + BASE_OFFSET * start_timestamp, start=start_time + BASE_OFFSET * start_timestamp,
end=start_time + BASE_OFFSET * end_timestamp, end=start_time + BASE_OFFSET * end_timestamp,
temperature=request.temperature, temperature=request.temperature,
text=self.tokenizer.decode(sliced_timestamp_tokens[1:-1]), text=text,
# The compression ratio measures
# how compressible the generated text is.
# A higher ratio indicates more repetitive content,
# which is a strong sign of hallucination in outputs.
compression_ratio=len(text_bytes)
/ len(zlib.compress(text_bytes)),
tokens=sliced_timestamp_tokens[1:-1], tokens=sliced_timestamp_tokens[1:-1],
avg_logprob=avg_logprob / (idx - last_timestamp_start),
), ),
) )
segments.append(casting_segment) segments.append(casting_segment)
last_timestamp_start = idx last_timestamp_start = idx
avg_logprob = 0
else:
avg_logprob += log_probs[idx - 1][token].logprob
return segments return segments
async def _create_speech_to_text( async def _create_speech_to_text(
@@ -443,6 +455,8 @@ class OpenAISpeechToText(OpenAIServing):
sampling_params = request.to_sampling_params( sampling_params = request.to_sampling_params(
default_max_tokens, self.default_sampling_params default_max_tokens, self.default_sampling_params
) )
if request.response_format == "verbose_json":
sampling_params.logprobs = 1
self._log_inputs( self._log_inputs(
request_id, request_id,
@@ -490,12 +504,14 @@ class OpenAISpeechToText(OpenAIServing):
) )
async for op in result_generator: async for op in result_generator:
if request.response_format == "verbose_json": if request.response_format == "verbose_json":
assert op.outputs[0].logprobs
segments: list[SpeechToTextSegment] = ( segments: list[SpeechToTextSegment] = (
self._get_verbose_segments( self._get_verbose_segments(
tokens=tuple(op.outputs[0].token_ids), tokens=tuple(op.outputs[0].token_ids),
segment_class=segment_class, segment_class=segment_class,
request=request, request=request,
start_time=start_time, start_time=start_time,
log_probs=op.outputs[0].logprobs,
) )
) )