[MISC] Add lora requests to metrics (#9477)
Co-authored-by: Kunjan Patel <kunjanp_google_com@vllm.us-central1-a.c.kunjanp-gke-dev-2.internal>
This commit is contained in:
@@ -34,7 +34,11 @@ class Metrics:
|
||||
See https://prometheus.github.io/client_python/multiprocess/ for more
|
||||
details on limitations.
|
||||
"""
|
||||
|
||||
labelname_finish_reason = "finished_reason"
|
||||
labelname_waiting_lora_adapters = "waiting_lora_adapters"
|
||||
labelname_running_lora_adapters = "running_lora_adapters"
|
||||
labelname_max_lora = "max_lora"
|
||||
_gauge_cls = prometheus_client.Gauge
|
||||
_counter_cls = prometheus_client.Counter
|
||||
_histogram_cls = prometheus_client.Histogram
|
||||
@@ -55,6 +59,16 @@ class Metrics:
|
||||
documentation="Number of requests waiting to be processed.",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum")
|
||||
self.gauge_lora_info = self._gauge_cls(
|
||||
name="vllm:lora_requests_info",
|
||||
documentation="Running stats on lora requests.",
|
||||
labelnames=[
|
||||
self.labelname_running_lora_adapters,
|
||||
self.labelname_max_lora,
|
||||
self.labelname_waiting_lora_adapters,
|
||||
],
|
||||
multiprocess_mode="livemostrecent",
|
||||
)
|
||||
self.gauge_scheduler_swapped = self._gauge_cls(
|
||||
name="vllm:num_requests_swapped",
|
||||
documentation="Number of requests swapped to CPU.",
|
||||
@@ -426,6 +440,9 @@ class PrometheusStatLogger(StatLoggerBase):
|
||||
for datum in data:
|
||||
histogram.labels(**self.labels).observe(datum)
|
||||
|
||||
def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
|
||||
gauge.labels(**data).set(1)
|
||||
|
||||
def _log_prometheus(self, stats: Stats) -> None:
|
||||
# System state data
|
||||
self._log_gauge(self.metrics.gauge_scheduler_running,
|
||||
@@ -442,7 +459,17 @@ class PrometheusStatLogger(StatLoggerBase):
|
||||
stats.cpu_prefix_cache_hit_rate)
|
||||
self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
|
||||
stats.gpu_prefix_cache_hit_rate)
|
||||
|
||||
# Including max-lora in metric, in future this property of lora
|
||||
# config maybe extended to be dynamic.
|
||||
lora_info = {
|
||||
self.metrics.labelname_running_lora_adapters:
|
||||
",".join(stats.running_lora_adapters),
|
||||
self.metrics.labelname_waiting_lora_adapters:
|
||||
",".join(stats.waiting_lora_adapters),
|
||||
self.metrics.labelname_max_lora:
|
||||
stats.max_lora,
|
||||
}
|
||||
self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
|
||||
# Iteration level data
|
||||
self._log_counter(self.metrics.counter_num_preemption,
|
||||
stats.num_preemption_iter)
|
||||
|
||||
Reference in New Issue
Block a user