[Frontend] Add per-request number of cached token stats (#10174)

This commit is contained in:
zifeitong
2024-11-12 08:42:28 -08:00
committed by GitHub
parent 176fcb1c71
commit 47db6ec831
9 changed files with 89 additions and 23 deletions

View File

@@ -78,6 +78,11 @@ def parse_args():
help="Port number for the Prometheus metrics server "
"(only needed if enable-metrics is set).",
)
parser.add_argument(
"--enable-prompt-tokens-details",
action='store_true',
default=False,
help="If set to True, enable prompt_tokens_details in usage.")
return parser.parse_args()
@@ -217,6 +222,7 @@ async def main(args):
prompt_adapters=None,
request_logger=request_logger,
chat_template=None,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
) if model_config.task == "generate" else None
openai_serving_embedding = OpenAIServingEmbedding(
engine,