[ Bugfix ] Fix Prometheus Metrics With zeromq Frontend (#7279)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
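
Context for the diff below: the OpenAI-compatible server now runs the engine behind a zeromq-based multiprocessing frontend, so prometheus_client has to operate in multiprocess mode and every Gauge needs an explicit cross-process aggregation mode. The following is a minimal standalone sketch of that pattern, not part of this commit; the directory path, label name, and values are illustrative only:

import os

# Multiprocess mode is enabled by pointing prometheus_client at a shared
# directory *before* it is imported; /tmp/prom_demo is a hypothetical path.
os.environ.setdefault("PROMETHEUS_MULTIPROC_DIR", "/tmp/prom_demo")
os.makedirs(os.environ["PROMETHEUS_MULTIPROC_DIR"], exist_ok=True)

import prometheus_client
from prometheus_client import multiprocess

# In multiprocess mode a Gauge must declare how per-process values are
# combined at scrape time; "sum" matches what the gauges in this diff use.
gauge = prometheus_client.Gauge(
    name="vllm:num_requests_running",
    documentation="Number of requests currently running on GPU.",
    labelnames=["model_name"],
    multiprocess_mode="sum")
gauge.labels(model_name="demo-model").set(3)

# The scraping process aggregates the per-process .db files via a
# MultiProcessCollector attached to a fresh registry.
registry = prometheus_client.CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
print(prometheus_client.generate_latest(registry).decode())

Multiprocess mode also does not support Info metrics, which is why the commit replaces the cache-config Info metric with a gauge (see the new info() implementation near the end of the diff).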
@@ -1,13 +1,12 @@
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Counter as CollectionsCounter
from typing import Dict, List, Optional, Protocol, Union
from typing import Dict, List, Optional, Union

import numpy as np
import prometheus_client

from vllm.engine.metrics_types import (StatLoggerBase, Stats,
                                       SupportsMetricsInfo)
from vllm.executor.ray_utils import ray
from vllm.logger import init_logger

@@ -29,41 +28,49 @@ prometheus_client.disable_created_metrics()

# begin-metrics-definitions
class Metrics:
    """
    vLLM uses a multiprocessing-based frontend for the OpenAI server.
    This means that we need to run prometheus_client in multiprocessing mode
    See https://prometheus.github.io/client_python/multiprocess/ for more
    details on limitations.
    """
    labelname_finish_reason = "finished_reason"
    _gauge_cls = prometheus_client.Gauge
    _counter_cls = prometheus_client.Counter
    _histogram_cls = prometheus_client.Histogram

    def __init__(self, labelnames: List[str], max_model_len: int):
        # Unregister any existing vLLM collectors
        # Unregister any existing vLLM collectors (for CI/CD)
        self._unregister_vllm_metrics()

        # Config Information
        self._create_info_cache_config()

        # System stats
        # Scheduler State
        self.gauge_scheduler_running = self._gauge_cls(
            name="vllm:num_requests_running",
            documentation="Number of requests currently running on GPU.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.gauge_scheduler_waiting = self._gauge_cls(
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.gauge_scheduler_swapped = self._gauge_cls(
            name="vllm:num_requests_swapped",
            documentation="Number of requests swapped to CPU.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        # KV Cache Usage in %
        self.gauge_gpu_cache_usage = self._gauge_cls(
            name="vllm:gpu_cache_usage_perc",
            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.gauge_cpu_cache_usage = self._gauge_cls(
            name="vllm:cpu_cache_usage_perc",
            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")

        # Iteration stats
        self.counter_num_preemption = self._counter_cls(
@@ -137,11 +144,13 @@ class Metrics:
        self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls(
            name="vllm:spec_decode_draft_acceptance_rate",
documentation="Speulative token acceptance rate.",
|
||||
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.gauge_spec_decode_efficiency = self._gauge_cls(
            name="vllm:spec_decode_efficiency",
            documentation="Speculative decoding system efficiency.",
            labelnames=labelnames)
            labelnames=labelnames,
            multiprocess_mode="sum")
        self.counter_spec_decode_num_accepted_tokens = (self._counter_cls(
            name="vllm:spec_decode_num_accepted_tokens_total",
            documentation="Number of accepted tokens.",
@@ -160,19 +169,18 @@ class Metrics:
            name="vllm:avg_prompt_throughput_toks_per_s",
            documentation="Average prefill throughput in tokens/s.",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        # Deprecated in favor of vllm:generation_tokens_total
        self.gauge_avg_generation_throughput = self._gauge_cls(
            name="vllm:avg_generation_throughput_toks_per_s",
            documentation="Average generation throughput in tokens/s.",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )

    def _create_info_cache_config(self) -> None:
        # Config Information
        self.info_cache_config = prometheus_client.Info(
            name='vllm:cache_config',
            documentation='information of cache_config')

# end-metrics-definitions

    def _unregister_vllm_metrics(self) -> None:
        for collector in list(prometheus_client.REGISTRY._collector_to_names):
@@ -180,9 +188,6 @@ class Metrics:
                prometheus_client.REGISTRY.unregister(collector)


# end-metrics-definitions


class _RayGaugeWrapper:
    """Wraps around ray.util.metrics.Gauge to provide same API as
    prometheus_client.Gauge"""
@@ -190,7 +195,9 @@ class _RayGaugeWrapper:
    def __init__(self,
                 name: str,
                 documentation: str = "",
                 labelnames: Optional[List[str]] = None):
                 labelnames: Optional[List[str]] = None,
                 multiprocess_mode: str = ""):
        del multiprocess_mode
        labelnames_tuple = tuple(labelnames) if labelnames else None
        self._gauge = ray_metrics.Gauge(name=name,
                                        description=documentation,
@@ -268,10 +275,6 @@ class RayMetrics(Metrics):
        # No-op on purpose
        pass

    def _create_info_cache_config(self) -> None:
        # No-op on purpose
        pass


def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
@@ -295,46 +298,6 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
        exponent += 1


@dataclass
class Stats:
    """Created by LLMEngine for use by StatLogger."""
    now: float

    # System stats (should have _sys suffix)
    # Scheduler State
    num_running_sys: int
    num_waiting_sys: int
    num_swapped_sys: int
    # KV Cache Usage in %
    gpu_cache_usage_sys: float
    cpu_cache_usage_sys: float

    # Iteration stats (should have _iter suffix)
    num_prompt_tokens_iter: int
    num_generation_tokens_iter: int
    time_to_first_tokens_iter: List[float]
    time_per_output_tokens_iter: List[float]
    num_preemption_iter: int

    # Request stats (should have _requests suffix)
    # Latency
    time_e2e_requests: List[float]
    # Metadata
    num_prompt_tokens_requests: List[int]
    num_generation_tokens_requests: List[int]
    best_of_requests: List[int]
    n_requests: List[int]
    finished_reason_requests: List[str]

    spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None


class SupportsMetricsInfo(Protocol):

    def metrics_info(self) -> Dict[str, str]:
        ...


def local_interval_elapsed(now: float, last_log: float,
                           local_interval: float) -> bool:
    elapsed_time = now - last_log
@@ -346,38 +309,9 @@ def get_throughput(tracked_stats: List[int], now: float,
    return float(np.sum(tracked_stats) / (now - last_log))


class StatLoggerBase(ABC):
    """Base class for StatLogger."""

    def __init__(self, local_interval: float) -> None:
        # Tracked stats over current local logging interval.
        self.num_prompt_tokens: List[int] = []
        self.num_generation_tokens: List[int] = []
        self.last_local_log = time.time()
        self.local_interval = local_interval
        self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None

    @abstractmethod
    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        raise NotImplementedError

    @abstractmethod
    def log(self, stats: Stats) -> None:
        raise NotImplementedError

    def maybe_update_spec_decode_metrics(self, stats: Stats):
        """Save spec decode metrics (since they are unlikely
        to be emitted at same time as log interval)."""
        if stats.spec_decode_metrics is not None:
            self.spec_decode_metrics = stats.spec_decode_metrics


class LoggingStatLogger(StatLoggerBase):
    """LoggingStatLogger is used in LLMEngine to log to Stdout."""

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        raise NotImplementedError

    def log(self, stats: Stats) -> None:
        """Called by LLMEngine.
        Logs to Stdout every self.local_interval seconds."""
@@ -440,10 +374,14 @@ class LoggingStatLogger(StatLoggerBase):
                f"Number of draft tokens: {metrics.draft_tokens}, "
                f"Number of emitted tokens: {metrics.emitted_tokens}.")

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        raise NotImplementedError


class PrometheusStatLogger(StatLoggerBase):
"""PrometheusStatLogger is used LLMEngine to log to Promethus."""
|
||||
    _metrics_cls = Metrics
    _gauge_cls = prometheus_client.Gauge

    def __init__(self, local_interval: float, labels: Dict[str, str],
                 max_model_len: int) -> None:
@@ -453,10 +391,6 @@ class PrometheusStatLogger(StatLoggerBase):
        self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                         max_model_len=max_model_len)

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        if type == "cache_config":
            self.metrics.info_cache_config.info(obj.metrics_info())

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Convenience function for logging to gauge.
        gauge.labels(**self.labels).set(data)
@@ -586,6 +520,19 @@ class PrometheusStatLogger(StatLoggerBase):
        self.last_local_log = stats.now
        self.spec_decode_metrics = None

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        # Info type metrics are syntactic sugar for a gauge permanently set to 1
        # Since prometheus multiprocessing mode does not support Info, emulate
        # info here with a gauge.
        if type == "cache_config":
            metrics_info = obj.metrics_info()
            info_gauge = self._gauge_cls(
                name="vllm:cache_config_info",
                documentation="Information of the LLMEngine CacheConfig",
                labelnames=metrics_info.keys(),
                multiprocess_mode="mostrecent")
            info_gauge.labels(**metrics_info).set(1)


class RayPrometheusStatLogger(PrometheusStatLogger):
    """RayPrometheusStatLogger uses Ray metrics instead."""
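
Because multiprocess mode does not support Info metrics, the new info() above exposes the cache config as labels on a gauge that is permanently set to 1. A standalone sketch of that pattern follows; the label names and values are made up for illustration, and the "mostrecent" mode assumes a reasonably recent prometheus_client release:

import prometheus_client

# Hypothetical config values; the real labels come from the cache config's
# metrics_info() dictionary.
cache_config_info = {"block_size": "16", "cache_dtype": "auto"}

# Emulated Info metric: one gauge, config values as labels, value fixed at 1.
info_gauge = prometheus_client.Gauge(
    name="vllm:cache_config_info",
    documentation="Information of the LLMEngine CacheConfig",
    labelnames=list(cache_config_info.keys()),
    multiprocess_mode="mostrecent")
info_gauge.labels(**cache_config_info).set(1)

# The scrape output then contains a series like:
#   vllm:cache_config_info{block_size="16",cache_dtype="auto"} 1.0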