[Metrics] Add Prometheus counters for Model FLOPs Utilization (MFU) (#30950)

Export the existing Model FLOPs Utilization (MFU) metrics via Prometheus. The `--enable-mfu-metrics` flag is required for these to be exposed.

Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2026-02-23 15:01:07 +00:00
parent b95bb6927f
commit 5cc7c4452e
5 changed files with 109 additions and 1 deletions
--- a/docs/mkdocs/hooks/generate_metrics.py
+++ b/docs/mkdocs/hooks/generate_metrics.py
@@ -22,6 +22,7 @@ METRIC_SOURCE_FILES = [
        "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
        "output": "nixl_connector.inc.md",
    },
+    {"path": "vllm/v1/metrics/perf.py", "output": "perf.inc.md"},
 ]


--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@@ -45,6 +45,12 @@ The following metrics are exposed:

 --8<-- "docs/generated/metrics/nixl_connector.inc.md"

+## Model FLOPs Utilization (MFU) Performance Metrics
+
+These metrics are exposed only when `--enable-mfu-metrics` is set:
+
+--8<-- "docs/generated/metrics/perf.inc.md"
+
 ## Deprecation Policy

 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
 from vllm.logger import init_logger
 from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
 from vllm.v1.engine import FinishReason
-from vllm.v1.metrics.perf import PerfMetricsLogging
+from vllm.v1.metrics.perf import PerfMetricsLogging, PerfMetricsProm
 from vllm.v1.metrics.prometheus import unregister_vllm_metrics
 from vllm.v1.metrics.stats import (
    CachingMetrics,
@@ -392,6 +392,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
    _histogram_cls = Histogram
    _spec_decoding_cls = SpecDecodingProm
    _kv_connector_cls = KVConnectorPrometheus
+    _perf_metrics_cls = PerfMetricsProm

    def __init__(
        self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
@@ -424,6 +425,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
        self.kv_connector_prom = self._kv_connector_cls(
            vllm_config, labelnames, per_engine_labelvalues
        )
+        self.perf_metrics_prom = self._perf_metrics_cls(
+            vllm_config, labelnames, per_engine_labelvalues
+        )

        #
        # Scheduler state
@@ -1065,6 +1069,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
                    scheduler_stats.kv_connector_stats, engine_idx
                )

+            if scheduler_stats.perf_stats is not None:
+                self.perf_metrics_prom.observe(scheduler_stats.perf_stats, engine_idx)
+
            if (
                self.kv_cache_metrics_enabled
                and scheduler_stats.kv_cache_eviction_events
--- a/vllm/v1/metrics/perf.py
+++ b/vllm/v1/metrics/perf.py
@@ -13,6 +13,7 @@ from collections.abc import Iterable
 from dataclasses import asdict, dataclass
 from typing import Any, Protocol

+import prometheus_client
 import torch
 from pydantic import BaseModel, Field, ValidationError, model_validator
 from typing_extensions import Self
@@ -1233,6 +1234,87 @@ class PerfMetricsLogging:
        self.reset()


+#### Prometheus Integration ####
+
+
+class PerfMetricsProm:
+    """Record performance metrics in Prometheus.
+
+    Average TFLOPS (tera floating-point operations per second) can be
+    calculated using a PromQL query:
+
+      rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12
+
+    Average memory bandwidth in GB/s can be calculated using:
+
+      (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) +
+       rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9
+    """
+
+    _counter_cls = prometheus_client.Counter
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[object]],
+    ):
+        counter_flops = self._counter_cls(
+            name="vllm:estimated_flops_per_gpu_total",
+            documentation=(
+                "Estimated number of floating point operations per GPU "
+                "(for Model Flops Utilization calculations)."
+            ),
+            labelnames=labelnames,
+        )
+        self.counter_flops = make_per_engine(counter_flops, per_engine_labelvalues)
+
+        counter_read_bytes = self._counter_cls(
+            name="vllm:estimated_read_bytes_per_gpu_total",
+            documentation=(
+                "Estimated number of bytes read from memory per GPU "
+                "(for Model Flops Utilization calculations)."
+            ),
+            labelnames=labelnames,
+        )
+        self.counter_read_bytes = make_per_engine(
+            counter_read_bytes, per_engine_labelvalues
+        )
+
+        counter_write_bytes = self._counter_cls(
+            name="vllm:estimated_write_bytes_per_gpu_total",
+            documentation=(
+                "Estimated number of bytes written to memory per GPU "
+                "(for Model Flops Utilization calculations)."
+            ),
+            labelnames=labelnames,
+        )
+        self.counter_write_bytes = make_per_engine(
+            counter_write_bytes, per_engine_labelvalues
+        )
+
+    def observe(self, perf_stats: PerfStats, engine_idx: int = 0):
+        if not (
+            perf_stats.num_flops_per_gpu
+            or perf_stats.num_read_bytes_per_gpu
+            or perf_stats.num_write_bytes_per_gpu
+        ):
+            return
+        self.counter_flops[engine_idx].inc(perf_stats.num_flops_per_gpu)
+        self.counter_read_bytes[engine_idx].inc(perf_stats.num_read_bytes_per_gpu)
+        self.counter_write_bytes[engine_idx].inc(perf_stats.num_write_bytes_per_gpu)
+
+
+def make_per_engine(
+    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]]
+):
+    """Map each engine index to the counter child bound to its label values."""
+    return {
+        idx: counter.labels(*labelvalues)
+        for idx, labelvalues in per_engine_labelvalues.items()
+    }
+
+
 ## util functions


--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -4,6 +4,7 @@ import time

 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus
 from vllm.v1.metrics.loggers import PrometheusStatLogger
+from vllm.v1.metrics.perf import PerfMetricsProm
 from vllm.v1.spec_decode.metrics import SpecDecodingProm

 try:
@@ -179,6 +180,16 @@ class RayKVConnectorPrometheus(KVConnectorPrometheus):
    _histogram_cls = RayHistogramWrapper


+class RayPerfMetricsProm(PerfMetricsProm):
+    """
+    RayPerfMetricsProm is used by RayMetrics to log Ray
+    metrics. Provides the same MFU metrics as PerfMetricsProm
+    but uses Ray's util.metrics library.
+    """
+
+    _counter_cls = RayCounterWrapper
+
+
 class RayPrometheusStatLogger(PrometheusStatLogger):
    """RayPrometheusStatLogger uses Ray metrics instead."""

@@ -187,6 +198,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger):
    _histogram_cls = RayHistogramWrapper
    _spec_decoding_cls = RaySpecDecodingProm
    _kv_connector_cls = RayKVConnectorPrometheus
+    _perf_metrics_cls = RayPerfMetricsProm

    @staticmethod
    def _unregister_vllm_metrics():