[Bugfix] Fix vLLM UsageInfo and logprobs None AssertionError with empty token_ids (#9034)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
This commit is contained in:
@@ -435,6 +435,12 @@ class OpenAIServingChat(OpenAIServing):
                 logprobs = None

                 delta_text = output.text

                 if not delta_text and not output.token_ids and \
                     not previous_num_tokens[i]:
                     # Chunked prefill case, don't return empty chunks
                     continue

                 delta_message: Optional[DeltaMessage]

                 # handle streaming deltas for tools with named tool_choice
||||
@@ -274,8 +274,6 @@ class OpenAIServingCompletion(OpenAIServing):
             for output in res.outputs:
                 i = output.index + prompt_idx * num_choices
                 # TODO(simon): optimize the performance by avoiding full
                 # text O(n^2) sending.

                 assert request.max_tokens is not None
                 if request.echo and request.max_tokens == 0:
@@ -307,6 +305,11 @@ class OpenAIServingCompletion(OpenAIServing):
                 delta_token_ids = output.token_ids
                 out_logprobs = output.logprobs

                 if not delta_text and not delta_token_ids \
                         and not previous_num_tokens[i]:
                     # Chunked prefill case, don't return empty chunks
                     continue

                 if request.logprobs is not None:
                     assert out_logprobs is not None, (
                         "Did not output logprobs")
Reference in New Issue
Block a user