[Frontend] Add max_tokens Prometheus metric (#9881)

Signed-off-by: Tomer Asida <tomera@ai21.com>
2024-11-05 00:53:24 +02:00
parent 9a5664d4a4
commit ac04a97a9f
5 changed files with 23 additions and 2 deletions
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -179,6 +179,12 @@ class Metrics:
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
+        self.histogram_max_tokens_request = self._histogram_cls(
+            name="vllm:request_params_max_tokens",
+            documentation="Histogram of the max_tokens request parameter.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
        self.counter_request_success = self._counter_cls(
            name="vllm:request_success_total",
            documentation="Count of successfully processed requests.",
@@ -547,6 +553,8 @@ class PrometheusStatLogger(StatLoggerBase):
            self.metrics.histogram_num_generation_tokens_request,
            stats.num_generation_tokens_requests)
        self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
+        self._log_histogram(self.metrics.histogram_max_tokens_request,
+                            stats.max_tokens_requests)

    def _log_prometheus_interval(self, prompt_throughput: float,
                                 generation_throughput: float) -> None: