[Frontend] Add Usage data in each chunk for chat_serving. #6540 (#6652)

Yehoshua Cohen
2024-07-23 21:41:55 +03:00
committed by GitHub
parent 0eb0757bef
commit 58f53034ad
2 changed files with 78 additions and 12 deletions

vllm/entrypoints/openai/serving_chat.py

@@ -247,7 +247,15 @@ class OpenAIServingChat(OpenAIServing):
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            usage = UsageInfo(prompt_tokens=prompt_tokens,
+                                              completion_tokens=0,
+                                              total_tokens=prompt_tokens)
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
+
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
@@ -277,7 +285,18 @@ class OpenAIServingChat(OpenAIServing):
                             model=model_name)
                         if (request.stream_options and
                                 request.stream_options.include_usage):
-                            chunk.usage = None
+                            if (request.stream_options.
+                                    continuous_usage_stats):
+                                prompt_tokens = len(
+                                    res.prompt_token_ids)
+                                usage = UsageInfo(
+                                    prompt_tokens=prompt_tokens,
+                                    completion_tokens=0,
+                                    total_tokens=prompt_tokens)
+                                chunk.usage = usage
+                            else:
+                                chunk.usage = None
+
                         data = chunk.model_dump_json(
                             exclude_unset=True)
                         yield f"data: {data}\n\n"
@@ -336,7 +355,19 @@ class OpenAIServingChat(OpenAIServing):
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens +
+                                completion_tokens,
+                            )
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
+
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
                 else:
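The third hunk covers the per-token delta chunks. output.token_ids holds everything generated so far, so len(output.token_ids) is a running completion count rather than a per-chunk increment. Sketched with made-up numbers (a 5-token prompt assumed), a client would observe totals like this:

    # Illustrative running totals across successive delta chunks; the
    # numbers are invented, not taken from the diff.
    prompt_tokens = 5
    for step, completion_tokens in enumerate((1, 2, 3), start=1):
        total_tokens = prompt_tokens + completion_tokens
        print(f"chunk {step}: prompt_tokens={prompt_tokens} "
              f"completion_tokens={completion_tokens} "
              f"total_tokens={total_tokens}")
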
@@ -356,7 +387,18 @@ class OpenAIServingChat(OpenAIServing):
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens +
+                                completion_tokens,
+                            )
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
                     finish_reason_sent[i] = True
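The final hunk attaches the same running totals to the finish-reason chunk, so usage stays populated through the last content-bearing chunk. End to end, the behavior can be observed with the openai SDK pointed at a vLLM server; this is a sketch with placeholder URL, key, and model, and it passes stream_options through extra_body since continuous_usage_stats is not one of the SDK's typed parameters:

    # Hypothetical client-side check against a vLLM OpenAI-compatible server.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    stream = client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "Hi"}],
        stream=True,
        extra_body={"stream_options": {"include_usage": True,
                                       "continuous_usage_stats": True}},
    )
    for chunk in stream:
        # With continuous_usage_stats, usage is populated on every chunk,
        # not only on the final usage-only chunk.
        print(chunk.usage)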