[Frontend] add logprob, compression_rate to 'verbose_json' features (#31059)
Signed-off-by: sangbumlikeagod <oironese@naver.com> Signed-off-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
This commit is contained in:
@@ -521,7 +521,7 @@ For `verbose_json` response format:
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
Currently “verbose_json” response format doesn’t support avg_logprob, compression_ratio, no_speech_prob.
|
Currently, the `verbose_json` response format does not support the `no_speech_prob` field.
|
||||||
|
|
||||||
#### Extra Parameters
|
#### Extra Parameters
|
||||||
|
|
||||||
|
|||||||
@@ -244,6 +244,8 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
|
|||||||
)
|
)
|
||||||
assert transcription.segments is not None
|
assert transcription.segments is not None
|
||||||
assert len(transcription.segments) > 0
|
assert len(transcription.segments) > 0
|
||||||
|
assert transcription.segments[0].avg_logprob is not None
|
||||||
|
assert transcription.segments[0].compression_ratio is not None
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|||||||
@@ -271,13 +271,13 @@ class TranscriptionSegment(OpenAIBaseModel):
|
|||||||
id: int
|
id: int
|
||||||
"""Unique identifier of the segment."""
|
"""Unique identifier of the segment."""
|
||||||
|
|
||||||
avg_logprob: float | None = None
|
avg_logprob: float
|
||||||
"""Average logprob of the segment.
|
"""Average logprob of the segment.
|
||||||
|
|
||||||
If the value is lower than -1, consider the logprobs failed.
|
If the value is lower than -1, consider the logprobs failed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
compression_ratio: float | None = None
|
compression_ratio: float
|
||||||
"""Compression ratio of the segment.
|
"""Compression ratio of the segment.
|
||||||
|
|
||||||
If the value is greater than 2.4, consider the compression failed.
|
If the value is greater than 2.4, consider the compression failed.
|
||||||
@@ -487,13 +487,13 @@ class TranslationSegment(OpenAIBaseModel):
|
|||||||
id: int
|
id: int
|
||||||
"""Unique identifier of the segment."""
|
"""Unique identifier of the segment."""
|
||||||
|
|
||||||
avg_logprob: float | None = None
|
avg_logprob: float
|
||||||
"""Average logprob of the segment.
|
"""Average logprob of the segment.
|
||||||
|
|
||||||
If the value is lower than -1, consider the logprobs failed.
|
If the value is lower than -1, consider the logprobs failed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
compression_ratio: float | None = None
|
compression_ratio: float
|
||||||
"""Compression ratio of the segment.
|
"""Compression ratio of the segment.
|
||||||
|
|
||||||
If the value is greater than 2.4, consider the compression failed.
|
If the value is greater than 2.4, consider the compression failed.
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import asyncio
|
|||||||
import io
|
import io
|
||||||
import math
|
import math
|
||||||
import time
|
import time
|
||||||
|
import zlib
|
||||||
from collections.abc import AsyncGenerator, Callable
|
from collections.abc import AsyncGenerator, Callable
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import Literal, TypeAlias, TypeVar, cast
|
from typing import Literal, TypeAlias, TypeVar, cast
|
||||||
@@ -38,6 +39,7 @@ from vllm.entrypoints.openai.translations.protocol import (
|
|||||||
from vllm.exceptions import VLLMValidationError
|
from vllm.exceptions import VLLMValidationError
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.logprobs import FlatLogprobs, Logprob
|
||||||
from vllm.model_executor.models import SupportsTranscription, supports_transcription
|
from vllm.model_executor.models import SupportsTranscription, supports_transcription
|
||||||
from vllm.outputs import RequestOutput
|
from vllm.outputs import RequestOutput
|
||||||
from vllm.tokenizers import get_tokenizer
|
from vllm.tokenizers import get_tokenizer
|
||||||
@@ -317,6 +319,7 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
def _get_verbose_segments(
|
def _get_verbose_segments(
|
||||||
self,
|
self,
|
||||||
tokens: tuple,
|
tokens: tuple,
|
||||||
|
log_probs: FlatLogprobs | list[dict[int, Logprob]],
|
||||||
request: SpeechToTextRequest,
|
request: SpeechToTextRequest,
|
||||||
segment_class: type[SpeechToTextSegment],
|
segment_class: type[SpeechToTextSegment],
|
||||||
start_time: float = 0,
|
start_time: float = 0,
|
||||||
@@ -329,8 +332,7 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
If the tokens do not include timestamp information,
|
If the tokens do not include timestamp information,
|
||||||
the segments may not be generated correctly.
|
the segments may not be generated correctly.
|
||||||
|
|
||||||
Note: Fields like avg_logprob, compression_ratio,
|
Note: The no_speech_prob field is not supported
|
||||||
and no_speech_prob are not supported
|
|
||||||
in this implementation and will be None. See docs for details.
|
in this implementation and will be None. See docs for details.
|
||||||
"""
|
"""
|
||||||
BASE_OFFSET = 0.02
|
BASE_OFFSET = 0.02
|
||||||
@@ -344,17 +346,17 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
|
|
||||||
if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
|
if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
|
||||||
tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
|
tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
|
||||||
for idx, token in enumerate(tokens_with_start):
|
avg_logprob = 0.0
|
||||||
|
for idx in range(1, len(tokens_with_start)):
|
||||||
# Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
|
# Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
|
||||||
# If the ordering is violated, this slicing may produce incorrect results.
|
# If the ordering is violated, this slicing may produce incorrect results.
|
||||||
if (
|
token = tokens_with_start[idx]
|
||||||
token >= init_token
|
if token >= init_token and tokens_with_start[idx - 1] >= init_token:
|
||||||
and idx != 0
|
|
||||||
and tokens_with_start[idx - 1] >= init_token
|
|
||||||
):
|
|
||||||
sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
|
sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
|
||||||
start_timestamp = sliced_timestamp_tokens[0] - init_token
|
start_timestamp = sliced_timestamp_tokens[0] - init_token
|
||||||
end_timestamp = sliced_timestamp_tokens[-1] - init_token
|
end_timestamp = sliced_timestamp_tokens[-1] - init_token
|
||||||
|
text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
|
||||||
|
text_bytes = text.encode("utf-8")
|
||||||
|
|
||||||
casting_segment = cast(
|
casting_segment = cast(
|
||||||
SpeechToTextSegment,
|
SpeechToTextSegment,
|
||||||
@@ -364,12 +366,22 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
start=start_time + BASE_OFFSET * start_timestamp,
|
start=start_time + BASE_OFFSET * start_timestamp,
|
||||||
end=start_time + BASE_OFFSET * end_timestamp,
|
end=start_time + BASE_OFFSET * end_timestamp,
|
||||||
temperature=request.temperature,
|
temperature=request.temperature,
|
||||||
text=self.tokenizer.decode(sliced_timestamp_tokens[1:-1]),
|
text=text,
|
||||||
|
# The compression ratio measures
|
||||||
|
# how compressible the generated text is.
|
||||||
|
# A higher ratio indicates more repetitive content,
|
||||||
|
# which is a strong sign of hallucination in outputs.
|
||||||
|
compression_ratio=len(text_bytes)
|
||||||
|
/ len(zlib.compress(text_bytes)),
|
||||||
tokens=sliced_timestamp_tokens[1:-1],
|
tokens=sliced_timestamp_tokens[1:-1],
|
||||||
|
avg_logprob=avg_logprob / (idx - last_timestamp_start),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
segments.append(casting_segment)
|
segments.append(casting_segment)
|
||||||
last_timestamp_start = idx
|
last_timestamp_start = idx
|
||||||
|
avg_logprob = 0
|
||||||
|
else:
|
||||||
|
avg_logprob += log_probs[idx - 1][token].logprob
|
||||||
return segments
|
return segments
|
||||||
|
|
||||||
async def _create_speech_to_text(
|
async def _create_speech_to_text(
|
||||||
@@ -443,6 +455,8 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
sampling_params = request.to_sampling_params(
|
sampling_params = request.to_sampling_params(
|
||||||
default_max_tokens, self.default_sampling_params
|
default_max_tokens, self.default_sampling_params
|
||||||
)
|
)
|
||||||
|
if request.response_format == "verbose_json":
|
||||||
|
sampling_params.logprobs = 1
|
||||||
|
|
||||||
self._log_inputs(
|
self._log_inputs(
|
||||||
request_id,
|
request_id,
|
||||||
@@ -490,12 +504,14 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
)
|
)
|
||||||
async for op in result_generator:
|
async for op in result_generator:
|
||||||
if request.response_format == "verbose_json":
|
if request.response_format == "verbose_json":
|
||||||
|
assert op.outputs[0].logprobs
|
||||||
segments: list[SpeechToTextSegment] = (
|
segments: list[SpeechToTextSegment] = (
|
||||||
self._get_verbose_segments(
|
self._get_verbose_segments(
|
||||||
tokens=tuple(op.outputs[0].token_ids),
|
tokens=tuple(op.outputs[0].token_ids),
|
||||||
segment_class=segment_class,
|
segment_class=segment_class,
|
||||||
request=request,
|
request=request,
|
||||||
start_time=start_time,
|
start_time=start_time,
|
||||||
|
log_probs=op.outputs[0].logprobs,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user