[Metrics] Add Prometheus counters for Model FLOPs Utilization (MFU) (#30950)
Export the existing Model FLOPs Utilization (MFU) metrics via Prometheus. `--enable-mfu-metrics` is required for these to be exposed. Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
@@ -22,6 +22,7 @@ METRIC_SOURCE_FILES = [
|
||||
"path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
|
||||
"output": "nixl_connector.inc.md",
|
||||
},
|
||||
{"path": "vllm/v1/metrics/perf.py", "output": "perf.inc.md"},
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -45,6 +45,12 @@ The following metrics are exposed:
|
||||
|
||||
--8<-- "docs/generated/metrics/nixl_connector.inc.md"
|
||||
|
||||
## Model FLOPs Utilization (MFU) Performance Metrics
|
||||
|
||||
These metrics are only exposed when `--enable-mfu-metrics` is set:
|
||||
|
||||
--8<-- "docs/generated/metrics/perf.inc.md"
|
||||
|
||||
## Deprecation Policy
|
||||
|
||||
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
||||
|
||||
@@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
from vllm.logger import init_logger
|
||||
from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
|
||||
from vllm.v1.engine import FinishReason
|
||||
from vllm.v1.metrics.perf import PerfMetricsLogging
|
||||
from vllm.v1.metrics.perf import PerfMetricsLogging, PerfMetricsProm
|
||||
from vllm.v1.metrics.prometheus import unregister_vllm_metrics
|
||||
from vllm.v1.metrics.stats import (
|
||||
CachingMetrics,
|
||||
@@ -392,6 +392,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
||||
_histogram_cls = Histogram
|
||||
_spec_decoding_cls = SpecDecodingProm
|
||||
_kv_connector_cls = KVConnectorPrometheus
|
||||
_perf_metrics_cls = PerfMetricsProm
|
||||
|
||||
def __init__(
|
||||
self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
|
||||
@@ -424,6 +425,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
||||
self.kv_connector_prom = self._kv_connector_cls(
|
||||
vllm_config, labelnames, per_engine_labelvalues
|
||||
)
|
||||
self.perf_metrics_prom = self._perf_metrics_cls(
|
||||
vllm_config, labelnames, per_engine_labelvalues
|
||||
)
|
||||
|
||||
#
|
||||
# Scheduler state
|
||||
@@ -1065,6 +1069,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
||||
scheduler_stats.kv_connector_stats, engine_idx
|
||||
)
|
||||
|
||||
if scheduler_stats.perf_stats is not None:
|
||||
self.perf_metrics_prom.observe(scheduler_stats.perf_stats, engine_idx)
|
||||
|
||||
if (
|
||||
self.kv_cache_metrics_enabled
|
||||
and scheduler_stats.kv_cache_eviction_events
|
||||
|
||||
@@ -13,6 +13,7 @@ from collections.abc import Iterable
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any, Protocol
|
||||
|
||||
import prometheus_client
|
||||
import torch
|
||||
from pydantic import BaseModel, Field, ValidationError, model_validator
|
||||
from typing_extensions import Self
|
||||
@@ -1233,6 +1234,87 @@ class PerfMetricsLogging:
|
||||
self.reset()
|
||||
|
||||
|
||||
#### Prometheus Integration ####
|
||||
|
||||
|
||||
class PerfMetricsProm:
    """Publish estimated per-GPU FLOPs and memory traffic as Prometheus counters.

    Only running totals are recorded; rates are derived at query time.
    For example, average TFLOPS (tera floating-point operations per second)
    over the last minute:

        rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12

    and average memory bandwidth in GB/s:

        (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) +
         rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9
    """

    # Counter implementation; subclasses may substitute a compatible class.
    _counter_cls = prometheus_client.Counter

    def __init__(
        self,
        vllm_config: VllmConfig,
        labelnames: list[str],
        per_engine_labelvalues: dict[int, list[object]],
    ):
        """Create the three counters and bind each one to every engine's labels.

        ``vllm_config`` is accepted but not read by this constructor.
        """
        flops_counter = self._counter_cls(
            name="vllm:estimated_flops_per_gpu_total",
            documentation=(
                "Estimated number of floating point operations per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_flops = make_per_engine(
            counter=flops_counter,
            per_engine_labelvalues=per_engine_labelvalues,
        )

        read_bytes_counter = self._counter_cls(
            name="vllm:estimated_read_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes read from memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_read_bytes = make_per_engine(
            counter=read_bytes_counter,
            per_engine_labelvalues=per_engine_labelvalues,
        )

        write_bytes_counter = self._counter_cls(
            name="vllm:estimated_write_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes written to memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_write_bytes = make_per_engine(
            counter=write_bytes_counter,
            per_engine_labelvalues=per_engine_labelvalues,
        )

    def observe(self, perf_stats: PerfStats, engine_idx: int = 0):
        """Add one batch of stats to the counters bound to ``engine_idx``.

        Nothing is recorded when all three values in ``perf_stats`` are falsy.
        """
        flops = perf_stats.num_flops_per_gpu
        read_bytes = perf_stats.num_read_bytes_per_gpu
        write_bytes = perf_stats.num_write_bytes_per_gpu

        # Skip the update entirely when there is nothing to add.
        if not flops and not read_bytes and not write_bytes:
            return

        self.counter_flops[engine_idx].inc(flops)
        self.counter_read_bytes[engine_idx].inc(read_bytes)
        self.counter_write_bytes[engine_idx].inc(write_bytes)
|
||||
|
||||
|
||||
def make_per_engine(
    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]]
):
    """Bind ``counter`` to each engine's label values.

    Returns a dict with the same keys as ``per_engine_labelvalues``; each
    value is the result of ``counter.labels(*labelvalues)`` for that key.
    """
    bound_by_engine = {}
    for engine_idx, values in per_engine_labelvalues.items():
        bound_by_engine[engine_idx] = counter.labels(*values)
    return bound_by_engine
|
||||
|
||||
|
||||
## util functions
|
||||
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import time
|
||||
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus
|
||||
from vllm.v1.metrics.loggers import PrometheusStatLogger
|
||||
from vllm.v1.metrics.perf import PerfMetricsProm
|
||||
from vllm.v1.spec_decode.metrics import SpecDecodingProm
|
||||
|
||||
try:
|
||||
@@ -179,6 +180,16 @@ class RayKVConnectorPrometheus(KVConnectorPrometheus):
|
||||
_histogram_cls = RayHistogramWrapper
|
||||
|
||||
|
||||
class RayPerfMetricsProm(PerfMetricsProm):
    """PerfMetricsProm variant used by RayMetrics.

    Provides the same MFU metrics as PerfMetricsProm, but records them
    through Ray's util.metrics library.
    """

    # Swap in the Ray-backed counter; everything else is inherited.
    _counter_cls = RayCounterWrapper
|
||||
|
||||
|
||||
class RayPrometheusStatLogger(PrometheusStatLogger):
|
||||
"""RayPrometheusStatLogger uses Ray metrics instead."""
|
||||
|
||||
@@ -187,6 +198,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger):
|
||||
_histogram_cls = RayHistogramWrapper
|
||||
_spec_decoding_cls = RaySpecDecodingProm
|
||||
_kv_connector_cls = RayKVConnectorPrometheus
|
||||
_perf_metrics_cls = RayPerfMetricsProm
|
||||
|
||||
@staticmethod
|
||||
def _unregister_vllm_metrics():
|
||||
|
||||
Reference in New Issue
Block a user