[Metrics] Add Prometheus counters for Model FLOPs Utilization (MFU) (#30950)

Export the existing Model FLOPs Utilization (MFU) metrics via Prometheus.

`--enable-mfu-metrics` is required for these to be exposed.

Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
Mark McLoughlin
2026-02-23 15:01:07 +00:00
committed by GitHub
parent b95bb6927f
commit 5cc7c4452e
5 changed files with 109 additions and 1 deletions

View File

@@ -22,6 +22,7 @@ METRIC_SOURCE_FILES = [
"path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py", "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
"output": "nixl_connector.inc.md", "output": "nixl_connector.inc.md",
}, },
{"path": "vllm/v1/metrics/perf.py", "output": "perf.inc.md"},
] ]

View File

@@ -45,6 +45,12 @@ The following metrics are exposed:
--8<-- "docs/generated/metrics/nixl_connector.inc.md" --8<-- "docs/generated/metrics/nixl_connector.inc.md"
## Model FLOPs Utilization (MFU) Performance Metrics
These metrics are available via `--enable-mfu-metrics`:
--8<-- "docs/generated/metrics/perf.inc.md"
## Deprecation Policy ## Deprecation Policy
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`

View File

@@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
from vllm.v1.engine import FinishReason from vllm.v1.engine import FinishReason
from vllm.v1.metrics.perf import PerfMetricsLogging from vllm.v1.metrics.perf import PerfMetricsLogging, PerfMetricsProm
from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.prometheus import unregister_vllm_metrics
from vllm.v1.metrics.stats import ( from vllm.v1.metrics.stats import (
CachingMetrics, CachingMetrics,
@@ -392,6 +392,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
_histogram_cls = Histogram _histogram_cls = Histogram
_spec_decoding_cls = SpecDecodingProm _spec_decoding_cls = SpecDecodingProm
_kv_connector_cls = KVConnectorPrometheus _kv_connector_cls = KVConnectorPrometheus
_perf_metrics_cls = PerfMetricsProm
def __init__( def __init__(
self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
@@ -424,6 +425,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self.kv_connector_prom = self._kv_connector_cls( self.kv_connector_prom = self._kv_connector_cls(
vllm_config, labelnames, per_engine_labelvalues vllm_config, labelnames, per_engine_labelvalues
) )
self.perf_metrics_prom = self._perf_metrics_cls(
vllm_config, labelnames, per_engine_labelvalues
)
# #
# Scheduler state # Scheduler state
@@ -1065,6 +1069,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
scheduler_stats.kv_connector_stats, engine_idx scheduler_stats.kv_connector_stats, engine_idx
) )
if scheduler_stats.perf_stats is not None:
self.perf_metrics_prom.observe(scheduler_stats.perf_stats, engine_idx)
if ( if (
self.kv_cache_metrics_enabled self.kv_cache_metrics_enabled
and scheduler_stats.kv_cache_eviction_events and scheduler_stats.kv_cache_eviction_events

View File

@@ -13,6 +13,7 @@ from collections.abc import Iterable
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
from typing import Any, Protocol from typing import Any, Protocol
import prometheus_client
import torch import torch
from pydantic import BaseModel, Field, ValidationError, model_validator from pydantic import BaseModel, Field, ValidationError, model_validator
from typing_extensions import Self from typing_extensions import Self
@@ -1233,6 +1234,87 @@ class PerfMetricsLogging:
self.reset() self.reset()
#### Prometheus Integration ####
class PerfMetricsProm:
    """Publish estimated per-GPU performance counters to Prometheus.

    Three counters are registered, each bound once per engine index:
    estimated FLOPs, estimated bytes read from memory and estimated bytes
    written to memory.

    Average TFLOPS (tera floating-point operations per second) can be
    derived with a PromQL query:

        rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12

    Average memory bandwidth in GB/s can be derived with:

        (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) +
         rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9
    """

    # Counter implementation; subclasses may substitute a compatible class.
    _counter_cls = prometheus_client.Counter

    def __init__(
        self,
        vllm_config: VllmConfig,
        labelnames: list[str],
        per_engine_labelvalues: dict[int, list[object]],
    ):
        """Register the counters and bind them to each engine's labels.

        Args:
            vllm_config: Accepted for signature parity; not read here.
            labelnames: Label names declared on every counter.
            per_engine_labelvalues: Engine index -> label values used to
                bind that engine's child counter.
        """
        self.counter_flops = make_per_engine(
            self._counter_cls(
                name="vllm:estimated_flops_per_gpu_total",
                documentation=(
                    "Estimated number of floating point operations per GPU "
                    "(for Model Flops Utilization calculations)."
                ),
                labelnames=labelnames,
            ),
            per_engine_labelvalues,
        )

        self.counter_read_bytes = make_per_engine(
            self._counter_cls(
                name="vllm:estimated_read_bytes_per_gpu_total",
                documentation=(
                    "Estimated number of bytes read from memory per GPU "
                    "(for Model Flops Utilization calculations)."
                ),
                labelnames=labelnames,
            ),
            per_engine_labelvalues,
        )

        self.counter_write_bytes = make_per_engine(
            self._counter_cls(
                name="vllm:estimated_write_bytes_per_gpu_total",
                documentation=(
                    "Estimated number of bytes written to memory per GPU "
                    "(for Model Flops Utilization calculations)."
                ),
                labelnames=labelnames,
            ),
            per_engine_labelvalues,
        )

    def observe(self, perf_stats: PerfStats, engine_idx: int = 0):
        """Add one batch of stats to the counters of engine `engine_idx`.

        Nothing is recorded when all three values in `perf_stats` are falsy.
        """
        flops = perf_stats.num_flops_per_gpu
        read_bytes = perf_stats.num_read_bytes_per_gpu
        write_bytes = perf_stats.num_write_bytes_per_gpu

        # Skip the update entirely when there is nothing to report.
        if not flops and not read_bytes and not write_bytes:
            return

        self.counter_flops[engine_idx].inc(flops)
        self.counter_read_bytes[engine_idx].inc(read_bytes)
        self.counter_write_bytes[engine_idx].inc(write_bytes)
def make_per_engine(
    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]]
):
    """Bind `counter` to the label values of every engine.

    Args:
        counter: Labelled counter whose `labels(...)` method yields a child.
        per_engine_labelvalues: Engine index -> positional label values.

    Returns:
        A dict with the same keys as `per_engine_labelvalues`, each mapped to
        the result of `counter.labels(*labelvalues)` for that engine.
    """
    bound = {}
    for engine_idx, values in per_engine_labelvalues.items():
        bound[engine_idx] = counter.labels(*values)
    return bound
## util functions ## util functions

View File

@@ -4,6 +4,7 @@ import time
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus
from vllm.v1.metrics.loggers import PrometheusStatLogger from vllm.v1.metrics.loggers import PrometheusStatLogger
from vllm.v1.metrics.perf import PerfMetricsProm
from vllm.v1.spec_decode.metrics import SpecDecodingProm from vllm.v1.spec_decode.metrics import SpecDecodingProm
try: try:
@@ -179,6 +180,16 @@ class RayKVConnectorPrometheus(KVConnectorPrometheus):
_histogram_cls = RayHistogramWrapper _histogram_cls = RayHistogramWrapper
class RayPerfMetricsProm(PerfMetricsProm):
    """MFU metrics for the Ray metrics path.

    Exposes the same MFU counters as PerfMetricsProm, but records them
    through Ray's util.metrics library (via RayCounterWrapper) instead of
    the default counter class.
    """

    # Only the counter implementation differs from the base class.
    _counter_cls = RayCounterWrapper
class RayPrometheusStatLogger(PrometheusStatLogger): class RayPrometheusStatLogger(PrometheusStatLogger):
"""RayPrometheusStatLogger uses Ray metrics instead.""" """RayPrometheusStatLogger uses Ray metrics instead."""
@@ -187,6 +198,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger):
_histogram_cls = RayHistogramWrapper _histogram_cls = RayHistogramWrapper
_spec_decoding_cls = RaySpecDecodingProm _spec_decoding_cls = RaySpecDecodingProm
_kv_connector_cls = RayKVConnectorPrometheus _kv_connector_cls = RayKVConnectorPrometheus
_perf_metrics_cls = RayPerfMetricsProm
@staticmethod @staticmethod
def _unregister_vllm_metrics(): def _unregister_vllm_metrics():