[ Bugfix ] Fix Prometheus Metrics With zeromq Frontend (#7279)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
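
Context for the diff below: the OpenAI-compatible server now runs the engine behind a zeromq-based multiprocessing frontend, so prometheus_client has to operate in multiprocess mode and every Gauge needs an explicit cross-process aggregation mode. The following is a minimal standalone sketch of that pattern, not part of this commit; the directory path, label name, and values are illustrative only:

import os

# Multiprocess mode is enabled by pointing prometheus_client at a shared
# directory *before* it is imported; /tmp/prom_demo is a hypothetical path.
os.environ.setdefault("PROMETHEUS_MULTIPROC_DIR", "/tmp/prom_demo")
os.makedirs(os.environ["PROMETHEUS_MULTIPROC_DIR"], exist_ok=True)

import prometheus_client
from prometheus_client import multiprocess

# In multiprocess mode a Gauge must declare how per-process values are
# combined at scrape time; "sum" matches what the gauges in this diff use.
gauge = prometheus_client.Gauge(
    name="vllm:num_requests_running",
    documentation="Number of requests currently running on GPU.",
    labelnames=["model_name"],
    multiprocess_mode="sum")
gauge.labels(model_name="demo-model").set(3)

# The scraping process aggregates the per-process .db files via a
# MultiProcessCollector attached to a fresh registry.
registry = prometheus_client.CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
print(prometheus_client.generate_latest(registry).decode())

Multiprocess mode also does not support Info metrics, which is why the commit replaces the cache-config Info metric with a gauge (see the new info() implementation near the end of the diff).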
@@ -1,13 +1,12 @@
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Counter as CollectionsCounter
from typing import Dict, List, Optional, Protocol, Union
from typing import Dict, List, Optional, Union

import numpy as np
import prometheus_client

from vllm.engine.metrics_types import (StatLoggerBase, Stats,
                                       SupportsMetricsInfo)
from vllm.executor.ray_utils import ray
from vllm.logger import init_logger

@@ -29,41 +28,49 @@ prometheus_client.disable_created_metrics()

# begin-metrics-definitions
class Metrics:
    """
    vLLM uses a multiprocessing-based frontend for the OpenAI server.
    This means that we need to run prometheus_client in multiprocessing mode
    See https://prometheus.github.io/client_python/multiprocess/ for more
    details on limitations.
    """
    labelname_finish_reason = "finished_reason"
    _gauge_cls = prometheus_client.Gauge
    _counter_cls = prometheus_client.Counter
    _histogram_cls = prometheus_client.Histogram

    def __init__(self, labelnames: List[str], max_model_len: int):
        # Unregister any existing vLLM collectors
        # Unregister any existing vLLM collectors (for CI/CD)
        self._unregister_vllm_metrics()

        # Config Information
        self._create_info_cache_config()

        # System stats
        # Scheduler State
        self.gauge_scheduler_running = self._gauge_cls(
            name="vllm:num_requests_running",
            documentation="Number of requests currently running on GPU.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.gauge_scheduler_waiting = self._gauge_cls(
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.gauge_scheduler_swapped = self._gauge_cls(
            name="vllm:num_requests_swapped",
            documentation="Number of requests swapped to CPU.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        # KV Cache Usage in %
        self.gauge_gpu_cache_usage = self._gauge_cls(
            name="vllm:gpu_cache_usage_perc",
            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.gauge_cpu_cache_usage = self._gauge_cls(
            name="vllm:cpu_cache_usage_perc",
            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")

        # Iteration stats
        self.counter_num_preemption = self._counter_cls(
@@ -137,11 +144,13 @@ class Metrics:
        self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls(
            name="vllm:spec_decode_draft_acceptance_rate",
documentation="Speulative token acceptance rate.",
|
||||
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.gauge_spec_decode_efficiency = self._gauge_cls(
            name="vllm:spec_decode_efficiency",
            documentation="Speculative decoding system efficiency.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.counter_spec_decode_num_accepted_tokens = (self._counter_cls(
            name="vllm:spec_decode_num_accepted_tokens_total",
            documentation="Number of accepted tokens.",
@@ -160,19 +169,18 @@ class Metrics:
            name="vllm:avg_prompt_throughput_toks_per_s",
            documentation="Average prefill throughput in tokens/s.",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        # Deprecated in favor of vllm:generation_tokens_total
        self.gauge_avg_generation_throughput = self._gauge_cls(
            name="vllm:avg_generation_throughput_toks_per_s",
            documentation="Average generation throughput in tokens/s.",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )

    def _create_info_cache_config(self) -> None:
        # Config Information
        self.info_cache_config = prometheus_client.Info(
            name='vllm:cache_config',
            documentation='information of cache_config')

# end-metrics-definitions

    def _unregister_vllm_metrics(self) -> None:
        for collector in list(prometheus_client.REGISTRY._collector_to_names):
@@ -180,9 +188,6 @@ class Metrics:
                prometheus_client.REGISTRY.unregister(collector)


# end-metrics-definitions


class _RayGaugeWrapper:
    """Wraps around ray.util.metrics.Gauge to provide same API as
    prometheus_client.Gauge"""
@@ -190,7 +195,9 @@ class _RayGaugeWrapper:
    def __init__(self,
                 name: str,
                 documentation: str = "",
                 labelnames: Optional[List[str]] = None):
                 labelnames: Optional[List[str]] = None,
                 multiprocess_mode: str = ""):
        del multiprocess_mode
        labelnames_tuple = tuple(labelnames) if labelnames else None
        self._gauge = ray_metrics.Gauge(name=name,
                                        description=documentation,
@@ -268,10 +275,6 @@ class RayMetrics(Metrics):
        # No-op on purpose
        pass

    def _create_info_cache_config(self) -> None:
        # No-op on purpose
        pass


def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
@@ -295,46 +298,6 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
        exponent += 1


@dataclass
class Stats:
    """Created by LLMEngine for use by StatLogger."""
    now: float

    # System stats (should have _sys suffix)
    # Scheduler State
    num_running_sys: int
    num_waiting_sys: int
    num_swapped_sys: int
    # KV Cache Usage in %
    gpu_cache_usage_sys: float
    cpu_cache_usage_sys: float

    # Iteration stats (should have _iter suffix)
    num_prompt_tokens_iter: int
    num_generation_tokens_iter: int
    time_to_first_tokens_iter: List[float]
    time_per_output_tokens_iter: List[float]
    num_preemption_iter: int

    # Request stats (should have _requests suffix)
    # Latency
    time_e2e_requests: List[float]
    # Metadata
    num_prompt_tokens_requests: List[int]
    num_generation_tokens_requests: List[int]
    best_of_requests: List[int]
    n_requests: List[int]
    finished_reason_requests: List[str]

    spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None


class SupportsMetricsInfo(Protocol):

    def metrics_info(self) -> Dict[str, str]:
        ...


def local_interval_elapsed(now: float, last_log: float,
                           local_interval: float) -> bool:
    elapsed_time = now - last_log
@@ -346,38 +309,9 @@ def get_throughput(tracked_stats: List[int], now: float,
    return float(np.sum(tracked_stats) / (now - last_log))


class StatLoggerBase(ABC):
    """Base class for StatLogger."""

    def __init__(self, local_interval: float) -> None:
        # Tracked stats over current local logging interval.
        self.num_prompt_tokens: List[int] = []
        self.num_generation_tokens: List[int] = []
        self.last_local_log = time.time()
        self.local_interval = local_interval
        self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None

    @abstractmethod
    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        raise NotImplementedError

    @abstractmethod
    def log(self, stats: Stats) -> None:
        raise NotImplementedError

    def maybe_update_spec_decode_metrics(self, stats: Stats):
        """Save spec decode metrics (since they are unlikely
        to be emitted at same time as log interval)."""
        if stats.spec_decode_metrics is not None:
            self.spec_decode_metrics = stats.spec_decode_metrics


class LoggingStatLogger(StatLoggerBase):
    """LoggingStatLogger is used in LLMEngine to log to Stdout."""

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        raise NotImplementedError

    def log(self, stats: Stats) -> None:
        """Called by LLMEngine.
        Logs to Stdout every self.local_interval seconds."""
@@ -440,10 +374,14 @@ class LoggingStatLogger(StatLoggerBase):
                f"Number of draft tokens: {metrics.draft_tokens}, "
                f"Number of emitted tokens: {metrics.emitted_tokens}.")

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        raise NotImplementedError


class PrometheusStatLogger(StatLoggerBase):
"""PrometheusStatLogger is used LLMEngine to log to Promethus."""
|
||||
    _metrics_cls = Metrics
    _gauge_cls = prometheus_client.Gauge

    def __init__(self, local_interval: float, labels: Dict[str, str],
                 max_model_len: int) -> None:
@@ -453,10 +391,6 @@ class PrometheusStatLogger(StatLoggerBase):
        self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                         max_model_len=max_model_len)

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        if type == "cache_config":
            self.metrics.info_cache_config.info(obj.metrics_info())

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Convenience function for logging to gauge.
        gauge.labels(**self.labels).set(data)
@@ -586,6 +520,19 @@ class PrometheusStatLogger(StatLoggerBase):
        self.last_local_log = stats.now
        self.spec_decode_metrics = None

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        # Info type metrics are syntactic sugar for a gauge permanently set to 1
        # Since prometheus multiprocessing mode does not support Info, emulate
        # info here with a gauge.
        if type == "cache_config":
            metrics_info = obj.metrics_info()
            info_gauge = self._gauge_cls(
                name="vllm:cache_config_info",
                documentation="Information of the LLMEngine CacheConfig",
                labelnames=metrics_info.keys(),
                multiprocess_mode="mostrecent")
            info_gauge.labels(**metrics_info).set(1)


class RayPrometheusStatLogger(PrometheusStatLogger):
    """RayPrometheusStatLogger uses Ray metrics instead."""
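
Because multiprocess mode does not support Info metrics, the new info() above exposes the cache config as labels on a gauge that is permanently set to 1. A standalone sketch of that pattern follows; the label names and values are made up for illustration, and the "mostrecent" mode assumes a reasonably recent prometheus_client release:

import prometheus_client

# Hypothetical config values; the real labels come from the cache config's
# metrics_info() dictionary.
cache_config_info = {"block_size": "16", "cache_dtype": "auto"}

# Emulated Info metric: one gauge, config values as labels, value fixed at 1.
info_gauge = prometheus_client.Gauge(
    name="vllm:cache_config_info",
    documentation="Information of the LLMEngine CacheConfig",
    labelnames=list(cache_config_info.keys()),
    multiprocess_mode="mostrecent")
info_gauge.labels(**cache_config_info).set(1)

# The scrape output then contains a series like:
#   vllm:cache_config_info{block_size="16",cache_dtype="auto"} 1.0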