[Misc] Add metrics for request queue time, forward time, and execute time (#9659)

This commit is contained in:
科英
2024-10-30 01:32:56 +08:00
committed by GitHub
parent 622b7ab955
commit 74fc2d77ae
4 changed files with 70 additions and 15 deletions

View File

@@ -133,7 +133,31 @@ class Metrics:
name="vllm:e2e_request_latency_seconds",
documentation="Histogram of end to end request latency in seconds.",
labelnames=labelnames,
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
buckets=[
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
40.0, 50.0, 60.0
])
self.histogram_time_in_queue_request = self._histogram_cls(
name="vllm:time_in_queue_requests",
documentation=
"Histogram of time the request spent in the queue in seconds.",
labelnames=labelnames,
buckets=[
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
40.0, 50.0, 60.0
])
self.histogram_model_forward_time_request = self._histogram_cls(
name="vllm:model_forward_time_milliseconds",
documentation=
"Histogram of time spent in the model forward pass in ms.",
labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
self.histogram_model_execute_time_request = self._histogram_cls(
name="vllm:model_execute_time_milliseconds",
documentation=
"Histogram of time spent in the model execute function in ms.",
labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
# Metadata
self.histogram_num_prompt_tokens_request = self._histogram_cls(
name="vllm:request_prompt_tokens",
@@ -299,16 +323,12 @@ class RayMetrics(Metrics):
pass
def build_1_2_5_buckets(max_value: int) -> List[int]:
def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
"""
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values (1, 2, 5) until the value exceeds the specified maximum.
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values until the value exceeds the specified maximum.
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
mantissa_lst = [1, 2, 5]
exponent = 0
buckets: List[int] = []
while True:
@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
exponent += 1
def build_1_2_5_buckets(max_value: int) -> List[int]:
"""
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
return build_buckets([1, 2, 5], max_value)
def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
"""
Example:
>>> build_1_2_3_5_8_buckets(100)
[1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
"""
return build_buckets([1, 2, 3, 5, 8], max_value)
def local_interval_elapsed(now: float, last_log: float,
local_interval: float) -> bool:
elapsed_time = now - last_log
@@ -486,6 +524,12 @@ class PrometheusStatLogger(StatLoggerBase):
# Latency
self._log_histogram(self.metrics.histogram_e2e_time_request,
stats.time_e2e_requests)
self._log_histogram(self.metrics.histogram_time_in_queue_request,
stats.time_in_queue_requests)
self._log_histogram(self.metrics.histogram_model_forward_time_request,
stats.model_forward_time_requests)
self._log_histogram(self.metrics.histogram_model_execute_time_request,
stats.model_execute_time_requests)
# Metadata
finished_reason_counter = CollectionsCounter(
stats.finished_reason_requests)