[V1][Metrics] Add several request timing histograms (#12644)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
@@ -85,6 +85,10 @@ EXPECTED_VALUES = {
|
||||
"vllm:time_per_output_token_seconds":
|
||||
[("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
|
||||
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
|
||||
"vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
|
||||
"vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
|
||||
"vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
|
||||
"vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
|
||||
"vllm:request_prompt_tokens":
|
||||
[("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
|
||||
("_count", _NUM_REQUESTS)],
|
||||
@@ -169,6 +173,18 @@ EXPECTED_METRICS = [
|
||||
"vllm:e2e_request_latency_seconds_sum",
|
||||
"vllm:e2e_request_latency_seconds_bucket",
|
||||
"vllm:e2e_request_latency_seconds_count",
|
||||
"vllm:request_queue_time_seconds_sum",
|
||||
"vllm:request_queue_time_seconds_bucket",
|
||||
"vllm:request_queue_time_seconds_count",
|
||||
"vllm:request_inference_time_seconds_sum",
|
||||
"vllm:request_inference_time_seconds_bucket",
|
||||
"vllm:request_inference_time_seconds_count",
|
||||
"vllm:request_prefill_time_seconds_sum",
|
||||
"vllm:request_prefill_time_seconds_bucket",
|
||||
"vllm:request_prefill_time_seconds_count",
|
||||
"vllm:request_decode_time_seconds_sum",
|
||||
"vllm:request_decode_time_seconds_bucket",
|
||||
"vllm:request_decode_time_seconds_count",
|
||||
"vllm:request_prompt_tokens_sum",
|
||||
"vllm:request_prompt_tokens_bucket",
|
||||
"vllm:request_prompt_tokens_count",
|
||||
@@ -220,6 +236,21 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
"vllm:e2e_request_latency_seconds_sum",
|
||||
"vllm:e2e_request_latency_seconds_bucket",
|
||||
"vllm:e2e_request_latency_seconds_count",
|
||||
"vllm:request_queue_time_seconds_sum",
|
||||
"vllm:request_queue_time_seconds_bucket",
|
||||
"vllm:request_queue_time_seconds_count",
|
||||
"vllm:request_inference_time_seconds_sum",
|
||||
"vllm:request_inference_time_seconds_bucket",
|
||||
"vllm:request_inference_time_seconds_count",
|
||||
"vllm:request_prefill_time_seconds_sum",
|
||||
"vllm:request_prefill_time_seconds_bucket",
|
||||
"vllm:request_prefill_time_seconds_count",
|
||||
"vllm:request_decode_time_seconds_sum",
|
||||
"vllm:request_decode_time_seconds_bucket",
|
||||
"vllm:request_decode_time_seconds_count",
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user